pacman::p_load(ggplot2, dplyr, party, caret, corrplot)
# Load the product attribute tables.
# stringsAsFactors = TRUE is spelled out because read.csv() stopped converting
# text columns to factors by default in R 4.0.0, and party::ctree() does not
# accept character predictors. On older R versions this is the default, so
# behaviour there is unchanged.
# NOTE(review): products2018 is read from the "...Past.csv" file and is not
# used in the visible part of this script - confirm the file name is intended.
products2017 <- read.csv(
  "~/Documents/@/UBIQUM/DATAML/Week6/Profitability/Profitability/DATASETS/existingproductattributes2017.csv",
  stringsAsFactors = TRUE
)
products2018 <- read.csv(
  "~/Documents/@/UBIQUM/DATAML/Week6/Profitability/Profitability/DATASETS/existingproductattributesPast.csv",
  stringsAsFactors = TRUE
)
#Remove Na
# ncol(is.na(products2017))
# summary(is.na(products2017))
# na.omit(products2017)
# summary(products2017)
# complete.cases(products2017)
# str(products2017)
# products2017 <- products2017[complete.cases(products2017),]
# str(products2017)
# Drop BestSellersRank (per the original note, the column containing NA).
products2017[["BestSellersRank"]] <- NULL
# Pattern exploration, V.1: leave out the first two columns so that only the
# numeric attributes feed the plots and the correlation matrix.
products2017Num <- products2017[-(1:2)]
str(products2017Num)
## 'data.frame': 80 obs. of 15 variables:
## $ Price : num 949 2250 399 410 1080 ...
## $ x5StarReviews : int 3 2 3 49 58 83 11 33 16 10 ...
## $ x4StarReviews : int 3 1 0 19 31 30 3 19 9 1 ...
## $ x3StarReviews : int 2 0 0 8 11 10 0 12 2 1 ...
## $ x2StarReviews : int 0 0 0 3 7 9 0 5 0 0 ...
## $ x1StarReviews : int 0 0 0 9 36 40 1 9 2 0 ...
## $ PositiveServiceReview: int 2 1 1 7 7 12 3 5 2 2 ...
## $ NegativeServiceReview: int 0 0 0 8 20 5 0 3 1 0 ...
## $ Recommendproduct : num 0.9 0.9 0.9 0.8 0.7 0.3 0.9 0.7 0.8 0.9 ...
## $ ShippingWeight : num 25.8 50 17.4 5.7 7 1.6 7.3 12 1.8 0.75 ...
## $ ProductDepth : num 23.9 35 10.5 15 12.9 ...
## $ ProductWidth : num 6.62 31.75 8.3 9.9 0.3 ...
## $ ProductHeight : num 16.9 19 10.2 1.3 8.9 ...
## $ ProfitMargin : num 0.15 0.25 0.08 0.08 0.09 0.05 0.05 0.05 0.05 0.05 ...
## $ Volume : int 12 8 12 196 232 332 44 132 64 40 ...
# Visualize the data. First approach
# for (col in 1:ncol(products2017Num)) {
# hist(products2017Num[,col], main = names(products2017Num[col]), xlab=colnames(products2017Num[col]), border = "black", col = "#bcd4e6", breaks=50 )}
#
# for (col in 1:ncol(products2017Num)) {
# boxplot(products2017Num[,col], main = names(products2017Num[col]), xlab=colnames(products2017Num[col]), border = "black", col = "#bcd4e6" )}
# Decision Tree 1: conditional inference tree of Volume against every other
# column of products2017, grown to a maximum depth of 4.
products2017_ctree <- ctree(
  formula = Volume ~ .,
  data = products2017,
  controls = ctree_control(maxdepth = 4)
)
plot(products2017_ctree)

# Correlation matrix of the numeric attributes, drawn with corrplot.
corrData <- products2017Num %>% cor()
corrplot(corrData)

# Variable selection A: drop the second column and column 4 (x5StarReviews).
products2017CombA <- products2017[-c(2, 4)]
# Decision Tree A: same setup as the first tree, fitted on the reduced table.
products2017_ctreeA <- ctree(
  formula = Volume ~ .,
  data = products2017CombA,
  controls = ctree_control(maxdepth = 4)
)
plot(products2017_ctreeA)

# Positive and 4StarReviews are the most important variables.
# FEATURE ENGINEERING
# The star-rating counts are strongly correlated with each other, so the five
# individual columns are removed and replaced by two aggregates:
#   GoodReviews = 4-star + 3-star counts
#   BadReviews  = 2-star + 1-star counts
# x5StarReviews is dropped without a replacement.
# The columns are dropped by name (they are columns 4 to 8 of products2017) so
# the selection does not silently shift if the column order changes.
products2017CombMerg <- products2017[
  setdiff(
    names(products2017),
    c(
      "x5StarReviews", "x4StarReviews", "x3StarReviews",
      "x2StarReviews", "x1StarReviews"
    )
  )
]
products2017CombMerg$GoodReviews <-
  products2017$x4StarReviews + products2017$x3StarReviews
products2017CombMerg$BadReviews <-
  products2017$x2StarReviews + products2017$x1StarReviews
# Decision Tree B: Volume against the engineered table, depth limit 10.
products2017_ctreeB <- ctree(
  Volume ~ .,
  data = products2017CombMerg,
  controls = ctree_control(maxdepth = 10)
)
plot(products2017_ctreeB)

# Select variables:: Feature Eng. B.2
# Without 5StarReviews, without the first two (non-numeric / categorical)
# columns, so that cor() and hist() only see numeric data.
products2017CombMerg2 <- products2017CombMerg[c(-1, -2)]
# Correlation Matrix B
corrDataMerg2 <- cor(products2017CombMerg2)
corrplot(corrDataMerg2)

# Visualize the data. B
# One histogram per column. Iterating over the column names avoids the
# 1:ncol() pitfall (it yields c(1, 0) for an empty table) and keeps the loop
# variable from being confused with hist()'s `col =` colour argument.
for (colName in names(products2017CombMerg2)) {
  hist(
    products2017CombMerg2[[colName]],
    main = colName,
    xlab = colName,
    border = "black",
    col = "#bcd4e6",
    breaks = 200
  )
}












# There are some outliers in PositiveServiceReview and in the GoodReviews and
# BadReviews, so look at how their values are distributed.
# Frequency of each GoodReviews value.
GoodReviewsNums <- products2017CombMerg %>%
  group_by(GoodReviews) %>%
  summarise(n = n())
# Rows sharing the suspicious GoodReviews value. The comparison uses a number
# rather than the string '35' so it does not depend on implicit coercion.
GoodReviewsOutliers <- products2017CombMerg %>% filter(GoodReviews == 35)
# Products 134 to 141 are duplicated (the only difference between them is the
# price), so they are removed.
# NOTE(review): the removal is positional (rows 34 to 41); confirm those rows
# still hold products 134-141 whenever the input file changes.
# Select Rows:: Feature Eng. C Without duplicated rows
products2017CombMergClean <- products2017CombMerg[-(34:41), ]
# Decision Tree C: Volume against the de-duplicated table, depth limit 10.
products2017_ctreeC <- ctree(
  Volume ~ .,
  data = products2017CombMergClean,
  controls = ctree_control(maxdepth = 10)
)
plot(products2017_ctreeC)

# NOTE(review): this section was headed "Creating dummy variables", but no
# dummy variables are built here; it only prepares the numeric subset below.
# Select variables:: Feature Eng. C.2 Without Categorical Variables
# (drops the first two columns of the de-duplicated table).
products2017CombMergClean2 <- products2017CombMergClean[c(-1, -2)]
# Visualize the data. C
# One histogram per column. Iterating over the column names avoids the
# 1:ncol() pitfall (it yields c(1, 0) for an empty table) and keeps the loop
# variable from being confused with hist()'s `col =` colour argument.
for (colName in names(products2017CombMergClean2)) {
  hist(
    products2017CombMergClean2[[colName]],
    main = colName,
    xlab = colName,
    border = "black",
    col = "#bcd4e6",
    breaks = 200
  )
}












# Total sales Volume per ProductType. geom_col() stacks the Volume of every
# row within a product type, so each bar height is that type's summed volume.
# The previous geom_bar() only counted rows per type, and its continuous
# `fill = Volume` mapping could not be applied to the counted bars.
ggplot(products2017CombMergClean, aes(x = ProductType, y = Volume)) +
  geom_col()

# Correlation Matrix C: pairwise correlations of the de-duplicated numeric
# table, rendered with corrplot.
corrDataMergClean2 <- products2017CombMergClean2 %>% cor()
corrplot(corrDataMergClean2)

# Detect outliers for 5StarReviews and 4StarReviews.
# All equality filters below compare against numbers instead of quoted strings
# so they do not depend on implicit numeric-to-character coercion.
# Frequency of each x5StarReviews value.
x5StarNums <- products2017 %>%
  group_by(x5StarReviews) %>%
  summarise(n = n())
x5StarOutliers <- products2017 %>% filter(x5StarReviews > 1000) # Product 150, 198
x5StarOutliers2 <- products2017 %>% filter(x5StarReviews == 308)
# Product 150, 198
x4StarOutliers <- products2017 %>% filter(x4StarReviews > 100)
#priceNums <- products2017 %>% group_by(Price) %>% summarise (n = n())
###the same but in different way >>> priceNums <- summarise(group_by(products2017Num, Price), (n = n()))
# Frequency of each service-review count.
NegativeNums <- products2017 %>%
  group_by(NegativeServiceReview) %>%
  summarise(n = n())
PositiveNums <- products2017 %>%
  group_by(PositiveServiceReview) %>%
  summarise(n = n())
# Detect outliers for PositiveServiceReview
positiveServiceOutliers <- products2017 %>% filter(PositiveServiceReview == 280)
#remove outliers
#products2017 <- products2017[-c(36:41),]
# Detect outliers for NegativeServiceReview
negativeServiceOutliers <- products2017 %>% filter(NegativeServiceReview == 112)
#remove outlier because is clearly more different in every attribute
#products2017 <- products2017[-c(23),]
hist(products2017$PositiveServiceReview)

# box_plot <- boxplot(products2017[, c("","")])
#sort(products2017Num$NegativeServiceReview)